Data Visualization Project 02

Load libraries

library(tidyverse)
library(sf)
library(ggrepel)
library(plotly)
library(ggtext)
library(ggpubr)

Read data

marathon <- read_csv("../data/marathon_results_2017.csv")
flCounties <- st_read("../data/Florida_Counties/Florida_Counties.shp", quiet = T)
usCities <- read_csv("../data/uscities.csv") 

Data Wrangling

# check data
head(marathon)
## # A tibble: 6 × 22
##   Bib   Name     Age `M/F` City  State Country `5K`   `10K` `15K` `20K` Half    
##   <chr> <chr>  <dbl> <chr> <chr> <chr> <chr>   <time> <chr> <chr> <chr> <time>  
## 1 11    Kirui…    24 M     Keri… <NA>  KEN     15'25" 0:30… 0:45… 1:01… 01:04:35
## 2 17    Rupp,…    30 M     Port… OR    USA     15'24" 0:30… 0:45… 1:01… 01:04:35
## 3 23    Osako…    25 M     Mach… <NA>  JPN     15'25" 0:30… 0:45… 1:01… 01:04:36
## 4 21    Biwot…    32 M     Mamm… CA    USA     15'25" 0:30… 0:45… 1:01… 01:04:45
## 5 9     Chebe…    31 M     Mara… <NA>  KEN     15'25" 0:30… 0:45… 1:01… 01:04:35
## 6 15    Abdir…    40 M     Phoe… AZ    USA     15'25" 0:30… 0:45… 1:01… 01:04:35
## # … with 10 more variables: `25K` <time>, `30K` <time>, `35K` <time>,
## #   `40K` <time>, Pace <time>, `Proj Time` <chr>, `Official Time` <time>,
## #   Overall <dbl>, Gender <dbl>, Division <dbl>

I want to focus on Florida

# get just people from Florida
flMar <- filter(marathon, State == "FL") 
# check data
head(flMar)
## # A tibble: 6 × 22
##   Bib   Name     Age `M/F` City  State Country `5K`   `10K` `15K` `20K` Half    
##   <chr> <chr>  <dbl> <chr> <chr> <chr> <chr>   <time> <chr> <chr> <chr> <time>  
## 1 148   Vanos…    45 M     Orla… FL    USA     17'42" 0:35… 0:53… 1:11… 01:15:08
## 2 358   Byers…    36 M     Miami FL    USA     18'14" 0:36… 0:55… 1:13… 01:17:25
## 3 470   Shold…    36 M     Jack… FL    USA     18'20" 0:36… 0:55… 1:13… 01:17:42
## 4 333   Sloan…    29 M     Jack… FL    USA     18'07" 0:36… 0:54… 1:12… 01:15:59
## 5 2123  Hartj…    24 M     Nept… FL    USA     20'21" 0:40… 1:00… 1:19… 01:24:14
## 6 176   Huber…    38 M     Miam… FL    USA     18'08" 0:36… 0:54… 1:13… 01:17:27
## # … with 10 more variables: `25K` <time>, `30K` <time>, `35K` <time>,
## #   `40K` <time>, Pace <time>, `Proj Time` <chr>, `Official Time` <time>,
## #   Overall <dbl>, Gender <dbl>, Division <dbl>
# change variable names to make them easier for using
names(flMar) <- make.names(names(flMar))
# remove any rows that could have been duplicated
flMar <- distinct(flMar)
# select just columns I might use
flMar <- select(flMar, c("Age", "M.F", "City", "State", "Official.Time", "Overall", "Gender"))
# check for NAs
colSums(is.na(flMar))
##           Age           M.F          City         State Official.Time 
##             0             0             0             0             0 
##       Overall        Gender 
##             0             0
# check data
flMar
## # A tibble: 609 × 7
##      Age M.F   City             State Official.Time Overall Gender
##    <dbl> <chr> <chr>            <chr> <time>          <dbl>  <dbl>
##  1    45 M     Orlando          FL    02:34:40           94     84
##  2    36 M     Miami            FL    02:35:44          104     92
##  3    36 M     Jacksonville     FL    02:36:40          122    106
##  4    29 M     Jacksonville     FL    02:40:03          188    167
##  5    24 M     Neptune Beach    FL    02:46:56          416    386
##  6    38 M     Miami Beach      FL    02:47:06          424    393
##  7    37 M     Tallahassee      FL    02:47:31          448    417
##  8    27 M     Miami            FL    02:47:35          453    421
##  9    30 M     Jacksonville     FL    02:49:21          536    499
## 10    34 M     Saint Petersburg FL    02:49:31          549    512
## # … with 599 more rows

This dataset will only be used to find the county a city is in

# narrow dataset down to just Florida and needed columns
flCities <- usCities %>% 
  filter(state_id == "FL") %>% 
  select(c("city", "county_name"))
# add county name to marathon data check for failed matches
left_join(flMar, flCities, c("City" = "city")) %>% 
  filter(is.na(county_name)) 
## # A tibble: 67 × 8
##      Age M.F   City             State Official.Time Overall Gender county_name
##    <dbl> <chr> <chr>            <chr> <time>          <dbl>  <dbl> <chr>      
##  1    34 M     Saint Petersburg FL    02:49:31          549    512 <NA>       
##  2    43 M     St Petersburg    FL    02:52:16          711    669 <NA>       
##  3    42 F     Lowell           FL    03:03:01         1772    120 <NA>       
##  4    39 M     Land O Lakes     FL    03:09:06         2626   2392 <NA>       
##  5    40 M     Saint Augustine  FL    03:15:53         3833   3377 <NA>       
##  6    47 M     St Johns         FL    03:16:03         3861   3399 <NA>       
##  7    37 M     Lake Worth       FL    03:16:30         3950   3470 <NA>       
##  8    36 M     Port Saint Lucie FL    03:19:24         4533   3913 <NA>       
##  9    41 M     Saint Johns      FL    03:20:00         4654   4005 <NA>       
## 10    41 F     Saint Petersburg FL    03:25:21         5812    988 <NA>       
## # … with 57 more rows
# Converting and fixing city names to match names in flCities
flMar$City <- str_replace(flMar$City, "Saint", "St.")
flMar$City <- str_replace(flMar$City, "St ", "St. ")
flMar$City <- str_replace(flMar$City, " O ", " O' ")
flMar$City <- str_replace(flMar$City, " Locka", "-locka")
flMar$City <- str_replace(flMar$City, "Ft.", "Fort ")
flMar$City <- str_replace(flMar$City, "Fort  ", "Fort ") # one fort had a extra space
flMar$City <- str_replace(flMar$City, "Deland", "DeLand")
flMar$City <- str_replace(flMar$City, "Augustime", "Augustine")
# check for failed matches again
left_join(flMar, flCities, c("City" = "city")) %>% 
  filter(is.na(county_name)) %>% 
  group_by(City) %>% 
  summarize(n = n())
## # A tibble: 17 × 2
##    City                     n
##    <chr>                <int>
##  1 Amelia Island            1
##  2 Clearwater Beach         1
##  3 Deleon Springs           1
##  4 Lake Worth               3
##  5 Lithia                   2
##  6 Lowell                   1
##  7 Palm Beach Shore         1
##  8 Parrish                  1
##  9 Pensacola Beach          2
## 10 Ponte Vedra              1
## 11 Ponte Vedra Beach        4
## 12 Santa Rosa Beach         4
## 13 St. Johns                8
## 14 Tegucigalpa Honduras     1
## 15 Viera                    1
## 16 Watersound               1
## 17 West Palm                1

These cities were not found in the flCities data. Due to time constraints and the small amount of these cities with missing counties, their county will not be manually found and added.

# add counties to marathon data for rows with city matches
flMarCounties <- inner_join(flMar, flCities, c("City" = "city"))
# change county column name to match flCounties country column
flMarCounties <- rename(flMarCounties,  "COUNTYNAME" = "county_name")
flMarCounties
## # A tibble: 575 × 8
##      Age M.F   City           State Official.Time Overall Gender COUNTYNAME
##    <dbl> <chr> <chr>          <chr> <time>          <dbl>  <dbl> <chr>     
##  1    45 M     Orlando        FL    02:34:40           94     84 Orange    
##  2    36 M     Miami          FL    02:35:44          104     92 Miami-Dade
##  3    36 M     Jacksonville   FL    02:36:40          122    106 Duval     
##  4    29 M     Jacksonville   FL    02:40:03          188    167 Duval     
##  5    24 M     Neptune Beach  FL    02:46:56          416    386 Duval     
##  6    38 M     Miami Beach    FL    02:47:06          424    393 Miami-Dade
##  7    37 M     Tallahassee    FL    02:47:31          448    417 Leon      
##  8    27 M     Miami          FL    02:47:35          453    421 Miami-Dade
##  9    30 M     Jacksonville   FL    02:49:21          536    499 Duval     
## 10    34 M     St. Petersburg FL    02:49:31          549    512 Pinellas  
## # … with 565 more rows
# make flCounties county capitalization match flMarCounties
flCounties$COUNTYNAME <- str_to_title(flCounties$COUNTYNAME)
# match this county name
flCounties$COUNTYNAME <- replace(flCounties$COUNTYNAME, flCounties$COUNTYNAME == "Dade", "Miami-Dade")
# add the counties from the shapefile that had no runners and put a 0 for their number of runners.
flMarCounties <- flMarCounties %>% 
  group_by(COUNTYNAME) %>% 
  summarise(n = n()) %>% 
  right_join(flCounties, "COUNTYNAME") %>% 
  mutate(n = ifelse(is.na(n), 0, n)) %>% 
  select(c("COUNTYNAME", "n"))
arrange(flMarCounties, desc(n))
## # A tibble: 67 × 2
##    COUNTYNAME       n
##    <chr>        <dbl>
##  1 Miami-Dade      82
##  2 Hillsborough    48
##  3 Palm Beach      47
##  4 Orange          46
##  5 Broward         41
##  6 Duval           38
##  7 Pinellas        32
##  8 Pasco           22
##  9 Seminole        19
## 10 Brevard         18
## # … with 57 more rows

Data Visualizations

ggplotly(ggplot(flMar) +
  geom_boxplot(aes(M.F, Overall, fill = M.F, color = M.F)) + 
  scale_fill_manual(values = c(F = "lightcoral", M = "lightskyblue"), guide = "none") + 
  scale_color_manual(values = c(F = "Red", M = "Blue"), guide = "none") +
  scale_y_reverse() + # also makes the numbers negative
  labs(title = "<span style = 'color:Blue;'>Male</span> and <span style='color:Red;'>Female</span> Overall Rank for 2017 Boston Marathon Finishers from Florida") +
  theme_minimal() +
  theme(plot.title = element_markdown(), axis.title = element_blank(), axis.text.x = element_blank(), legend.position = "none"))

The best and worst finishers are similar for both genders, but on average males performed better.

# get coordinates for county labeling
flCountiesPoints <- st_centroid(flCounties)
flCountiesPoints <- cbind(flCountiesPoints, st_coordinates(st_centroid(flCountiesPoints$geometry)))

# filter just counties with over 20 finishers
# and split into left and right side for finer control on map
leftPoints <- filter(flCountiesPoints, COUNTYNAME %in% c("Hillsborough", "Pinellas", "Pasco"))
rightPoints <- filter(flCountiesPoints, COUNTYNAME %in% c("Miami-Dade","Palm Beach", "Orange", "Broward", "Duval"))

# spatial visualization
merge(flCounties, flMarCounties, by = "COUNTYNAME") %>% 
  ggplot() +
    geom_sf(aes(fill = n),alpha=0.8, col="white") +
    scale_fill_viridis_c(name = "") +
    geom_text_repel(data = leftPoints, aes(x = X, y = Y, label = COUNTYNAME), nudge_x = -1.5, min.segment.length = 0) + 
    geom_text_repel(data = rightPoints, aes(x = X, y = Y, label = COUNTYNAME), nudge_x = 1.5, min.segment.length = 0) + 
    annotate("text", -Inf, Inf, label = "82 finishers", hjust = -6.55, vjust = 36.5, size = 3) +
    labs(title = "2017 Boston Marathon Finishers by County in Florida", subtitle = "Counties with at least 20 finishers are labeled") +
    theme_void() +
    theme(plot.title = element_markdown(), plot.subtitle = element_markdown(), legend.position = "left")

It is interesting to see most finishers are from southeast Florida with a significant amount in central Florida, and Miami-Dade having a lot more than the rest.

ggplot(flMar, aes(Age, Official.Time, color = M.F)) +
  geom_point(alpha = 0.4) +
  geom_smooth(method = "lm") +
  scale_color_manual(values = c(F = "Red", M = "Blue"), guide_legend(title = "Gender")) +
  stat_regline_equation(aes(label = after_stat(rr.label)), show.legend = F) +
  labs(title = "Age vs Official Time of 2017 Boston Marathon Finishers from Florida") +
  xlab("Age (Year)") +
  ylab("Official Time (HH:MM:SS)") +
  theme_minimal() +
  theme(plot.title = element_markdown())
## `geom_smooth()` using formula = 'y ~ x'

The correlation between age and the official time achieved is not strong. But the linear model does suggest that older people take longer to finish and more so for males.


Report

My process started started with choosing a dataset. I decided to go with the 2017 Boston Marathon Finishers because it has a good amount of data, multiple variables that I could use, and it has city and state data that could be combined with a shapefile to make a spatial visualization. The dataset was decently formatted and I did not have to do much cleaning. Main changes made were renaming the columns for easier use, creating a subset of just the finishers from Florida with the columns I was going to use, and checking for missing values, which my subset had none of. I was able to make my originally planned charts which were an interactive boxplot, a spatial visualization of Florida counties, and a scatterplot with a linear model.

The interactive boxplot was the first graph I made. It was the gender of the finisher vs their overall rank. For this graph, I tried to keep it simple, so the only labels are the title and the rank values on the y axis. The title has colored words in it to match the gender for each boxplot. I also reversed the y axis, so that 0 is at the top. I think that this makes more sense as a rank closer to 0 is a better performance and I think better numbers should be at the top for a graph. One problem with this though is that hovering over the boxplot after making it interactive through plotly, shows a negative sign in front of the statistics for it. I could not find a solution for this, and I also had a problem where hjust was not working for this boxplot. However, the graph still looked fine and is able to clearly convey the information I choose from the dataset. The best and worst finishers are similar for both genders, but on average males performed better.

My next graph, the spatial visualization was difficult to make work the way I wanted. It shows the number of finishers per county in Florida, and I labeled the counties with over 20 finishers. My main issue is that I have the city of the finisher from marathon dataset but not their county. So, I found a freely available dataset that has us cities and their counties. I filtered that data down to just Florida cities and did a left join on it with my marathon data to check for missing matches. A large number of failed matches were due to inconsistent spelling and typos in the city name for the marathon data. After fixing those, I did an inner join to get just the matches, as there was only a small amount of cities left with no counties found for it in the US cities dataset. Then I had to do column renaming and changing a city name so that I could merge it with shapefile for Florida counties. I grouped my data by counties and counting the number of occurrences of county and merged that into shapefile for the Florida counties. Finally, with that combined data, I was able to make the spatial visualization. Using it, I saw that most finishers are from southeast Florida with a significant amount in central Florida, and Miami-Dade having a lot more than the rest.

My final graph was a scatterplot of the age vs official time for the finishers. The points were colored by gender and I added a linear model for both as well as their R-squared value. This was easy to make through the cleaned data I made before this. My only difficulty was adding the R-squared value, and I used stat_regline_equation from the ggpubr package to make it easier. Also, to make it easier to see the points at cluttered areas, I lowered the alpha of the points. Contrary to my previous graphs, I kept the axis titles to help explain their value types as I thought they were less clear then the others and putting the value types in the title would make my already long title too long. This graph showed that the correlation between age and the official time achieved is not strong. But the linear model does suggest that older people take longer to finish and more so for males.

Overall, for my graphs, I tried to keep them consistent. I removed any information I believed was unnecessary and repetitive. I kept the colors as consistent as I could. For both my boxplot and scatterplot that had gender in them, male was blue, and females were red. The only graph with different colors was the spatial visualization as it has a continuous variable, and I used the Viridis color palette for it, as it colors for it help show similarities and differences well without overemphasis.